{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "cbe5fae9",
   "metadata": {},
   "source": [
    "# Getting transcripts from the Ukrainian government website"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eaf76d30",
   "metadata": {},
   "source": [
    "Run the following command in the terminal to install the versions we use in this notebook:\n",
    "\n",
    "    pip install pandas=='1.3.3' beautifulsoup4=='4.11.1' selenium=='4.3.0' webdriver_manager=='3.7.1' tqdm=='4.62.2'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "697024c9",
   "metadata": {},
   "source": [
    "Importing packages and starting the Chrome WebDriver:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "16766d20",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[WDM] - ====== WebDriver manager ======\n",
      "[WDM] - Current google-chrome version is 103.0.5060\n",
      "[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome\n",
      "[WDM] - Driver [/Users/maiapolo/.wdm/drivers/chromedriver/mac64/103.0.5060.134/chromedriver] found in cache\n",
      "/usr/local/anaconda/envs/stats600/lib/python3.7/site-packages/ipykernel_launcher.py:6: DeprecationWarning: executable_path has been deprecated, please pass in a Service object\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup as bs\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "from tqdm import tqdm\n",
    "from webdriver_manager.chrome import ChromeDriverManager\n",
    "\n",
    "# Selenium 4 deprecates passing the driver path positionally (executable_path);\n",
    "# wrap the path returned by webdriver_manager in a Service object instead.\n",
    "driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80a9e264",
   "metadata": {},
   "source": [
    "Getting the URLs of the texts:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bee58fcc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 28/28 [00:31<00:00,  1.12s/it]\n"
     ]
    }
   ],
   "source": [
    "SPEECH_LIST_URL = \"https://www.president.gov.ua/en/news/speeches?page=\"\n",
    "N_LIST_PAGES = 28  # listing pages 1..N_LIST_PAGES are fetched\n",
    "\n",
    "\n",
    "def extract_speech_urls(page_html):\n",
    "    \"\"\"Return the speech URLs found in one listing page, in document order.\n",
    "\n",
    "    A URL is the text from the first 'https://' that follows an\n",
    "    'item_stat_headline' marker up to (not including) the next '\">'.\n",
    "    Scanning stops as soon as any of the three markers is missing, so a\n",
    "    failed search never turns into a slice taken at a negative offset.\n",
    "    \"\"\"\n",
    "    urls = []\n",
    "    pos = 0\n",
    "    while True:\n",
    "        headline = page_html.find('item_stat_headline', pos)\n",
    "        if headline == -1:\n",
    "            break\n",
    "        url_start = page_html.find('https://', headline)\n",
    "        if url_start == -1:\n",
    "            break\n",
    "        url_end = page_html.find('\">', url_start)\n",
    "        if url_end == -1:\n",
    "            break\n",
    "        urls.append(page_html[url_start:url_end])\n",
    "        pos = url_end  # resume the search right after this URL\n",
    "    return urls\n",
    "\n",
    "\n",
    "links = []\n",
    "\n",
    "for page in tqdm(range(1, N_LIST_PAGES + 1)):\n",
    "    driver.get(SPEECH_LIST_URL + str(page))\n",
    "    # Each entry is a one-element list [url] so that further fields can be\n",
    "    # stored next to the URL afterwards.\n",
    "    links.extend([url] for url in extract_speech_urls(driver.page_source))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ccab1668",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "273"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of entries collected in `links`.\n",
    "len(links)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c5b6cc1",
   "metadata": {},
   "source": [
    "Getting texts:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6cce7835",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 273/273 [08:22<00:00,  1.84s/it]\n"
     ]
    }
   ],
   "source": [
    "def parse_speech_page(page_html):\n",
    "    \"\"\"Extract (date, title, text) from the HTML source of one speech page.\n",
    "\n",
    "    A field whose marker cannot be found is returned as an empty string\n",
    "    instead of a slice taken at a bogus offset. If the date block is missing,\n",
    "    the title and text are searched for in the whole page.\n",
    "    \"\"\"\n",
    "    # Date: what follows the last '\"date\"' marker (plus one more character,\n",
    "    # presumably the '>' closing the tag -- TODO confirm) up to the next '</p>'.\n",
    "    date, body_start = '', 0\n",
    "    date_marker = page_html.rfind('\"date\"')\n",
    "    if date_marker != -1:\n",
    "        date_start = date_marker + 7\n",
    "        date_end = page_html.find('</p>', date_start)\n",
    "        if date_end != -1:\n",
    "            date = \" \".join(page_html[date_start:date_end].split())\n",
    "            body_start = date_end\n",
    "\n",
    "    # Title: value of the first alt attribute preceded by an empty title\n",
    "    # attribute after the date block.\n",
    "    # NOTE(review): the value is taken verbatim from the markup, so HTML\n",
    "    # entities (if any) are not decoded -- confirm whether that matters.\n",
    "    title = ''\n",
    "    title_marker = 'title=\"\" alt=\"'\n",
    "    title_pos = page_html.find(title_marker, body_start)\n",
    "    if title_pos != -1:\n",
    "        title_start = title_pos + len(title_marker)\n",
    "        title_end = page_html.find('\">', title_start)\n",
    "        if title_end != -1:\n",
    "            title = page_html[title_start:title_end]\n",
    "\n",
    "    # Text: everything BeautifulSoup renders after the date block, cut at the\n",
    "    # last run of six newlines followed by 'v' (presumably where the page\n",
    "    # footer starts -- TODO confirm).\n",
    "    text = bs(page_html[body_start:], features=\"html.parser\").get_text()\n",
    "    footer_pos = text.rfind('\\n\\n\\n\\n\\n\\nv')\n",
    "    if footer_pos != -1:  # slicing with -1 would silently drop the last character\n",
    "        text = text[:footer_pos]\n",
    "    text = \" \".join(text.split())  # collapse all whitespace, newlines included\n",
    "\n",
    "    return date, title, text\n",
    "\n",
    "\n",
    "for entry in tqdm(links):\n",
    "    driver.get(entry[0])\n",
    "    # Slice assignment keeps entry[0] (the URL) and replaces whatever follows,\n",
    "    # so re-running this cell refreshes the fields instead of appending\n",
    "    # duplicates behind stale ones.\n",
    "    entry[1:] = list(parse_speech_page(driver.page_source))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6f6bacbf",
   "metadata": {},
   "source": [
    "Saving CSV:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8464ed73",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One column per field, taken from the position each row of `links` stores it at.\n",
    "data = {\n",
    "    name: [row[pos] for row in links]\n",
    "    for pos, name in enumerate(('url', 'date', 'title', 'text'))\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "06a3520f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the column mapping into a DataFrame, spelling out the column order.\n",
    "data = pd.DataFrame(data=data, columns=['url', 'date', 'title', 'text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "016cbfdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# to_csv() does not create missing directories, so make sure 'data/' exists first.\n",
    "os.makedirs('data', exist_ok=True)\n",
    "data.to_csv('data/zelensky_speeches.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
